import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
# Consolidate on tensorflow.keras (mixing keras and tensorflow.keras imports can break
# under TF2) and keep only the layers and callbacks actually used below.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Conv1D, MaxPooling1D
from tensorflow.keras.layers import Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping
data = pd.read_csv('Data.csv')
#data
Pandas Profiling Report:
profile = ProfileReport(data, title = 'Exploratory Analysis Before Cleaning', explorative =True)
profile.to_file('ExploratoryAnalysisPrecleaning.html')
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['Date'],y=data['x1'],name='x1'))
fig.add_trace(go.Scatter(x=data['Date'],y=data['x2'],name='x2'))
fig.add_trace(go.Scatter(x=data['Date'],y=data['y'],name='y'))
fig.show()
Remove the last two rows, in which all columns are NaN.
data = data.drop(data.index[[-2,-1]])
Set the outliers to NaN.
# Locate the row with the spurious x1 spike (idxmax returns the row label, as .loc expects),
# then blank out the three outlier readings so they can be interpolated later.
outlier_ind = data['x1'].idxmax()
data.loc[outlier_ind] = data.loc[outlier_ind].replace(4230.0,np.nan).replace(99.99, np.nan).replace(517.0,np.nan)
#data.loc[outlier_ind]
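The replacement above targets the three outlier values by hand. A more general alternative (shown commented, as a sketch only; the 1.5×IQR rule is a common convention, not tuned to this data) would flag outliers per column:
# for col in ['x1', 'x2', 'y']:
#     q1, q3 = data[col].quantile([0.25, 0.75])
#     iqr = q3 - q1
#     mask = (data[col] < q1 - 1.5*iqr) | (data[col] > q3 + 1.5*iqr)
#     data.loc[mask, col] = np.nan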
date_NA_count = data['Date'].isna().sum()
x1_NA_count = data['x1'].isna().sum()
x2_NA_count = data['x2'].isna().sum()
y_NA_count = data['y'].isna().sum()
print('Date column has %d missing values' % date_NA_count)
print('x1 column has %d missing values' % x1_NA_count)
print('x2 column has %d missing values' % x2_NA_count)
print('y column has %d missing values' % y_NA_count)
Date column has 0 missing values
x1 column has 39 missing values
x2 column has 39 missing values
y column has 39 missing values
Use the interpolate() method to fill the missing values in the time series.
data = data.interpolate()
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['Date'],y=data['x1'],name='x1'))
fig.add_trace(go.Scatter(x=data['Date'],y=data['x2'],name='x2'))
fig.add_trace(go.Scatter(x=data['Date'],y=data['y'],name='y'))
fig.show()
Pandas Profiling Report (after data cleaning):
profile2 = ProfileReport(data, title = 'Exploratory Analysis After Cleaning', explorative =True)
profile2.to_file('ExploratoryAnalysisPostcleaning.html')
Convert the y column into a NumPy array.
y = data['y'].values
#y = data[['x1','y']].values
#print(y.shape)
Train (70%), validation (15%), and test (15%) split.
train_portion = round(y.shape[0]*0.7)
val_portion = round(y.shape[0]*0.15)
train_data = y[:train_portion]
val_data = y[train_portion:train_portion+val_portion]
test_data = y[train_portion+val_portion:]
#print('We have %d training, %d validation, and %d test data points.' % (len(train_data), len(val_data), len(test_data)))
#print(train_data.shape)
#print(val_data.shape)
#print(test_data.shape)
scaler_pred = MinMaxScaler(feature_range = (0,1))
Reshape each 1D array into a 2D column vector, as required by MinMaxScaler.
#print(train_data.shape)
train_data = train_data.reshape(-1,1)
val_data = val_data.reshape(-1,1)
test_data = test_data.reshape(-1,1)
#print(train_data.shape)
#print(val_data.shape)
#print(test_data.shape)
scaler_pred.fit(train_data)
trainNorm = scaler_pred.transform(train_data)
valNorm = scaler_pred.transform(val_data)
testNorm = scaler_pred.transform(test_data)
#print(trainNorm.shape)
#print(valNorm.shape)
#print(testNorm.shape)
def creatSeq(dataset, look_back, foresight):
    # Build supervised windows: each row of X holds look_back consecutive values,
    # and Y holds the value (look_back + foresight) steps after the window start.
    X, Y = [], []
    for i in range(len(dataset) - look_back - foresight):
        obs = dataset[i:(i + look_back), 0]  # window of look_back values
        X.append(obs)
        Y.append(dataset[i + (look_back + foresight), 0])  # future target
    return np.array(X), np.array(Y)
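As a quick illustration of the windowing semantics (a toy example, not from the original data): with look_back = 3 and foresight = 1, the window [0, 1, 2] is paired with target 4, i.e., the target sits look_back + foresight steps after the window start.
# Toy check of the windowing semantics.
demo = np.arange(10, dtype=float).reshape(-1, 1)
demoX, demoY = creatSeq(demo, look_back=3, foresight=1)
#print(demoX[0], demoY[0])  # [0. 1. 2.] 4.0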
trainNormX, trainNormY = creatSeq(trainNorm, look_back = 7, foresight = 1)
valNormX, valNormY = creatSeq(valNorm, look_back = 7, foresight = 1)
testNormX, testNormY = creatSeq(testNorm, look_back = 7, foresight = 1)
#print(testNormX.shape)
#print(trainNormY.shape)
#print(valNormX.shape)
#print(valNormY.shape)
#print(testNormX.shape)
#print(testNormY.shape)
If the number of neurons or hidden layers is too large, the model will overfit; if the number of neurons is too small, it will underfit. We noticed that setting the number of neurons equal to the look-back step reduced the test-set error.
model = Sequential()
#n_neurons = trainNormX.shape[1] * trainNormX.shape[2]
model.add(Dense(7, activation = 'linear', input_shape = (trainNormX.shape[1], )))
model.add(Dropout(0.1))
model.add(Dense(7, activation = 'linear'))
model.add(Dropout(0.1))
model.add(Dense(7, activation = 'linear'))
model.add(Dropout(0.1))
model.add(Dense(1, activation = 'linear'))
model.compile(loss = 'mae', optimizer = 'adam', metrics = ['mean_absolute_error'])
#model.summary()
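A quick sanity check on the capacity argument above (a small sketch, using Keras's count_params) is to compare the number of trainable parameters with the number of training samples:
#print('Trainable parameters: %d, training samples: %d' % (model.count_params(), trainNormX.shape[0]))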
early_stop = EarlyStopping(monitor='val_loss',patience=5,verbose=1,mode='auto',restore_best_weights=True)
callbacks_list = [early_stop]
network = model.fit(trainNormX,trainNormY,validation_data = (valNormX, valNormY),
epochs=100,batch_size=64,callbacks=callbacks_list)
Epoch 1/100 5/5 [==============================] - 1s 39ms/step - loss: 0.3461 - mean_absolute_error: 0.3461 - val_loss: 0.2380 - val_mean_absolute_error: 0.2380 Epoch 2/100 5/5 [==============================] - 0s 7ms/step - loss: 0.2542 - mean_absolute_error: 0.2542 - val_loss: 0.1727 - val_mean_absolute_error: 0.1727 Epoch 3/100 5/5 [==============================] - 0s 7ms/step - loss: 0.2220 - mean_absolute_error: 0.2220 - val_loss: 0.1519 - val_mean_absolute_error: 0.1519 Epoch 4/100 5/5 [==============================] - 0s 7ms/step - loss: 0.2286 - mean_absolute_error: 0.2286 - val_loss: 0.1573 - val_mean_absolute_error: 0.1573 Epoch 5/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1999 - mean_absolute_error: 0.1999 - val_loss: 0.1583 - val_mean_absolute_error: 0.1583 Epoch 6/100 5/5 [==============================] - 0s 7ms/step - loss: 0.2042 - mean_absolute_error: 0.2042 - val_loss: 0.1513 - val_mean_absolute_error: 0.1513 Epoch 7/100 5/5 [==============================] - 0s 7ms/step - loss: 0.2072 - mean_absolute_error: 0.2072 - val_loss: 0.1449 - val_mean_absolute_error: 0.1449 Epoch 8/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1749 - mean_absolute_error: 0.1749 - val_loss: 0.1428 - val_mean_absolute_error: 0.1428 Epoch 9/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1957 - mean_absolute_error: 0.1957 - val_loss: 0.1418 - val_mean_absolute_error: 0.1418 Epoch 10/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1885 - mean_absolute_error: 0.1885 - val_loss: 0.1410 - val_mean_absolute_error: 0.1410 Epoch 11/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1649 - mean_absolute_error: 0.1649 - val_loss: 0.1404 - val_mean_absolute_error: 0.1404 Epoch 12/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1657 - mean_absolute_error: 0.1657 - val_loss: 0.1392 - val_mean_absolute_error: 0.1392 Epoch 13/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1868 - mean_absolute_error: 0.1868 - val_loss: 0.1388 - val_mean_absolute_error: 0.1388 Epoch 14/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1897 - mean_absolute_error: 0.1897 - val_loss: 0.1381 - val_mean_absolute_error: 0.1381 Epoch 15/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1773 - mean_absolute_error: 0.1773 - val_loss: 0.1375 - val_mean_absolute_error: 0.1375 Epoch 16/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1719 - mean_absolute_error: 0.1719 - val_loss: 0.1371 - val_mean_absolute_error: 0.1371 Epoch 17/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1579 - mean_absolute_error: 0.1579 - val_loss: 0.1367 - val_mean_absolute_error: 0.1367 Epoch 18/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1751 - mean_absolute_error: 0.1751 - val_loss: 0.1363 - val_mean_absolute_error: 0.1363 Epoch 19/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1719 - mean_absolute_error: 0.1719 - val_loss: 0.1360 - val_mean_absolute_error: 0.1360 Epoch 20/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1654 - mean_absolute_error: 0.1654 - val_loss: 0.1362 - val_mean_absolute_error: 0.1362 Epoch 21/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1575 - mean_absolute_error: 0.1575 - val_loss: 0.1360 - val_mean_absolute_error: 0.1360 Epoch 22/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1642 - mean_absolute_error: 0.1642 - val_loss: 0.1358 - val_mean_absolute_error: 
0.1358 Epoch 23/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1618 - mean_absolute_error: 0.1618 - val_loss: 0.1356 - val_mean_absolute_error: 0.1356 Epoch 24/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1604 - mean_absolute_error: 0.1604 - val_loss: 0.1353 - val_mean_absolute_error: 0.1353 Epoch 25/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1600 - mean_absolute_error: 0.1600 - val_loss: 0.1352 - val_mean_absolute_error: 0.1352 Epoch 26/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1551 - mean_absolute_error: 0.1551 - val_loss: 0.1354 - val_mean_absolute_error: 0.1354 Epoch 27/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1528 - mean_absolute_error: 0.1528 - val_loss: 0.1353 - val_mean_absolute_error: 0.1353 Epoch 28/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1461 - mean_absolute_error: 0.1461 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 29/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1527 - mean_absolute_error: 0.1527 - val_loss: 0.1345 - val_mean_absolute_error: 0.1345 Epoch 30/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1546 - mean_absolute_error: 0.1546 - val_loss: 0.1352 - val_mean_absolute_error: 0.1352 Epoch 31/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1490 - mean_absolute_error: 0.1490 - val_loss: 0.1364 - val_mean_absolute_error: 0.1364 Epoch 32/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1527 - mean_absolute_error: 0.1527 - val_loss: 0.1349 - val_mean_absolute_error: 0.1349 Epoch 33/100 5/5 [==============================] - 0s 9ms/step - loss: 0.1462 - mean_absolute_error: 0.1462 - val_loss: 0.1337 - val_mean_absolute_error: 0.1337 Epoch 34/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1438 - mean_absolute_error: 0.1438 - val_loss: 0.1336 - val_mean_absolute_error: 0.1336 Epoch 35/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1418 - mean_absolute_error: 0.1418 - val_loss: 0.1335 - val_mean_absolute_error: 0.1335 Epoch 36/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1499 - mean_absolute_error: 0.1499 - val_loss: 0.1335 - val_mean_absolute_error: 0.1335 Epoch 37/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1463 - mean_absolute_error: 0.1463 - val_loss: 0.1334 - val_mean_absolute_error: 0.1334 Epoch 38/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1513 - mean_absolute_error: 0.1513 - val_loss: 0.1333 - val_mean_absolute_error: 0.1333 Epoch 39/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1423 - mean_absolute_error: 0.1423 - val_loss: 0.1331 - val_mean_absolute_error: 0.1331 Epoch 40/100 5/5 [==============================] - 0s 9ms/step - loss: 0.1425 - mean_absolute_error: 0.1425 - val_loss: 0.1330 - val_mean_absolute_error: 0.1330 Epoch 41/100 5/5 [==============================] - 0s 9ms/step - loss: 0.1372 - mean_absolute_error: 0.1372 - val_loss: 0.1329 - val_mean_absolute_error: 0.1329 Epoch 42/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1500 - mean_absolute_error: 0.1500 - val_loss: 0.1325 - val_mean_absolute_error: 0.1325 Epoch 43/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1443 - mean_absolute_error: 0.1443 - val_loss: 0.1324 - val_mean_absolute_error: 0.1324 Epoch 44/100 5/5 [==============================] - 0s 9ms/step - loss: 0.1435 - mean_absolute_error: 0.1435 - val_loss: 0.1322 - 
val_mean_absolute_error: 0.1322 Epoch 45/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1455 - mean_absolute_error: 0.1455 - val_loss: 0.1321 - val_mean_absolute_error: 0.1321 Epoch 46/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1440 - mean_absolute_error: 0.1440 - val_loss: 0.1322 - val_mean_absolute_error: 0.1322 Epoch 47/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1441 - mean_absolute_error: 0.1441 - val_loss: 0.1325 - val_mean_absolute_error: 0.1325 Epoch 48/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1409 - mean_absolute_error: 0.1409 - val_loss: 0.1326 - val_mean_absolute_error: 0.1326 Epoch 49/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1462 - mean_absolute_error: 0.1462 - val_loss: 0.1325 - val_mean_absolute_error: 0.1325 Epoch 50/100 1/5 [=====>........................] - ETA: 0s - loss: 0.1291 - mean_absolute_error: 0.1291Restoring model weights from the end of the best epoch: 45. 5/5 [==============================] - 0s 9ms/step - loss: 0.1455 - mean_absolute_error: 0.1455 - val_loss: 0.1325 - val_mean_absolute_error: 0.1325 Epoch 50: early stopping
Training stopped early at epoch 50; the weights from epoch 45, which had the best validation error, were restored to prevent overfitting.
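Since restore_best_weights=True was set, the restored epoch can also be read directly from the training history (a small sketch):
#best_epoch = int(np.argmin(network.history['val_loss'])) + 1
#print('Best epoch:', best_epoch)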
valMae = round(network.history['val_loss'][-1],2)
fig = go.Figure()
fig.add_trace(go.Scatter(y=network.history['loss'],mode='lines',name='Training Error'))
fig.add_trace(go.Scatter(y=network.history['val_loss'],mode='lines',name='Validation Error'))
fig.update_layout(yaxis_title='Mean Absolute Error',xaxis_title = 'epoch', title_text = 'Normalized MAE Validation = '
+ str(valMae))
fig.show()
# Get the predicted values
y_pred_scaled = model.predict(testNormX)
# Unscale the predicted values
y_pred = scaler_pred.inverse_transform(y_pred_scaled)
y_test_unscaled = scaler_pred.inverse_transform(testNormY.reshape(-1, 1))
# Mean Absolute Error (MAE)
MAE = np.mean(tf.keras.metrics.mean_absolute_error(y_test_unscaled, y_pred))
print(f'Mean Absolute Error (MAE): {np.round(MAE, 2)}')
# Mean Absolute Percentage Error (MAPE)
MAPE = np.mean((np.abs(np.subtract(y_test_unscaled, y_pred)/ y_test_unscaled))) * 100
print(f'Mean Absolute Percentage Error (MAPE): {np.round(MAPE, 2)} %')
# Median Absolute Percentage Error (MDAPE)
MDAPE = np.median((np.abs(np.subtract(y_test_unscaled, y_pred)/ y_test_unscaled)) ) * 100
print(f'Median Absolute Percentage Error (MDAPE): {np.round(MDAPE, 2)} %')
Mean Absolute Error (MAE): 6.55
Mean Absolute Percentage Error (MAPE): 58.16 %
Median Absolute Percentage Error (MDAPE): 6.23 %
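The same three error metrics are computed again for the later models; a small helper (a sketch, not part of the original cells) would avoid the repetition:
def report_errors(y_true, y_pred):
    # MAE, MAPE, and MDAPE on unscaled values.
    abs_err = np.abs(y_true - y_pred)
    pct_err = abs_err / np.abs(y_true)
    print(f'MAE: {np.mean(abs_err):.2f}')
    print(f'MAPE: {np.mean(pct_err) * 100:.2f} %')
    print(f'MDAPE: {np.median(pct_err) * 100:.2f} %')
#report_errors(y_test_unscaled, y_pred)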
fig = go.Figure()
fig.add_trace(go.Scatter(y=y_pred.reshape(-1,),mode='markers',name='Model Predictions on Test Set'))
fig.add_trace(go.Scatter(y=y_test_unscaled.reshape(-1,),mode='markers',name='Target Values for the Test Set'))
fig.update_layout(title_text = 'Unnormalized MAE Test = ' + str(MAE))
fig.show()
# Build the last two 7-day windows to forecast the next two days.
pred_y_norm = np.vstack((testNorm[-8:-1,0], testNorm[-7:,0]))
pred_y = model.predict(pred_y_norm)
pred_y_unnorm = scaler_pred.inverse_transform(pred_y)
print('y = %d on March 1st, y = %d on March 2nd' % (float(pred_y_unnorm[0]), float(pred_y_unnorm[1])))
y = 73 on March 1st, y = 71 on March 2nd
#X = data[['x1','x2','y']].values
X = data[['x1','y']].values
#X = data[['x2','y']].values
#X.shape
trainPortion = round(X.shape[0]*0.7)
valPortion = round(X.shape[0]*0.15)
trainData = X[:trainPortion]
valData = X[trainPortion: trainPortion+valPortion]
testData = X[trainPortion+valPortion:]
print('We have %d training, %d validation, and %d test data points.' % (len(trainData), len(valData), len(testData)))
#print(trainData.shape)
#print(valData.shape)
#print(testData.shape)
We have 298 training, 64 validation, and 63 test data points.
sc = MinMaxScaler(feature_range = (0,1))
sc.fit(trainData)
trainNorm = sc.transform(trainData)
valNorm = sc.transform(valData)
testNorm = sc.transform(testData)
#print(trainNorm.shape)
#print(valNorm.shape)
#print(testNorm.shape)
# Separate scaler for the target column, so predictions can be mapped back to the
# original scale. Note that it is fit on the test-set targets here; fitting it on
# the training targets instead would avoid information leakage.
scaler_pred = MinMaxScaler()
testNormY = testData[:,1].reshape(-1,1)
scaler_pred.fit(testNormY)
testNormYScaled = scaler_pred.transform(testNormY)
#print(testNormYScaled.shape)
def creatSeq(dataset, look_back, foresight):
    # Multivariate variant: each row of X holds look_back rows of all features,
    # and Y holds the y column (index 1) taken (look_back + foresight) steps
    # after the window start.
    X, Y = [], []
    for i in range(len(dataset) - look_back - foresight):
        obs = dataset[i:(i + look_back)]  # look_back rows of all features
        X.append(obs)
        Y.append(dataset[i + (look_back + foresight), 1])  # future y value
    return np.array(X), np.array(Y)
trainNormX, trainNormY = creatSeq(trainNorm, 7, 2)
valNormX, valNormY = creatSeq(valNorm, 7, 2)
testNormX, testNormY = creatSeq(testNorm, 7, 2)
#print(trainNormX.shape, trainNormY.shape)
#print(valNormX.shape, valNormY.shape)
#print(testNormX.shape, testNormY.shape)
If the number of neurons or hidden layers is too large, the model will overfit; if the number of neurons is too small, it will underfit. We noticed that setting the number of neurons to the look-back step multiplied by the number of features reduced the test-set error.
model = Sequential()
n_neurons = trainNormX.shape[1] * trainNormX.shape[2]
model.add(GRU(n_neurons, input_shape=(trainNormX.shape[1], trainNormX.shape[2]), dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(n_neurons, activation = 'linear'))
model.add(Dense(7, activation = 'linear'))
model.add(Dense(1, activation = 'linear'))
model.compile(loss = 'mae', optimizer = 'adam', metrics = ['mean_absolute_error'])
#model.summary()
early_stop = EarlyStopping(monitor='val_loss',patience=5,verbose=1,mode='auto',restore_best_weights=True)
callbacks_list = [early_stop]
network = model.fit(trainNormX,trainNormY, validation_data = (valNormX, valNormY),
epochs=100,batch_size=64,callbacks=callbacks_list)
Epoch 1/100 5/5 [==============================] - 2s 75ms/step - loss: 0.2448 - mean_absolute_error: 0.2448 - val_loss: 0.1823 - val_mean_absolute_error: 0.1823 Epoch 2/100 5/5 [==============================] - 0s 10ms/step - loss: 0.1667 - mean_absolute_error: 0.1667 - val_loss: 0.1306 - val_mean_absolute_error: 0.1306 Epoch 3/100 5/5 [==============================] - 0s 10ms/step - loss: 0.1601 - mean_absolute_error: 0.1601 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 4/100 5/5 [==============================] - 0s 10ms/step - loss: 0.1631 - mean_absolute_error: 0.1631 - val_loss: 0.1319 - val_mean_absolute_error: 0.1319 Epoch 5/100 5/5 [==============================] - 0s 10ms/step - loss: 0.1490 - mean_absolute_error: 0.1490 - val_loss: 0.1315 - val_mean_absolute_error: 0.1315 Epoch 6/100 5/5 [==============================] - 0s 9ms/step - loss: 0.1467 - mean_absolute_error: 0.1467 - val_loss: 0.1348 - val_mean_absolute_error: 0.1348 Epoch 7/100 1/5 [=====>........................] - ETA: 0s - loss: 0.1435 - mean_absolute_error: 0.1435Restoring model weights from the end of the best epoch: 2. 5/5 [==============================] - 0s 10ms/step - loss: 0.1466 - mean_absolute_error: 0.1466 - val_loss: 0.1339 - val_mean_absolute_error: 0.1339 Epoch 7: early stopping
Training stopped early at epoch 7; the weights from epoch 2, which had the best validation error, were restored to prevent overfitting.
valMae = round(network.history['val_loss'][-1],2)
fig = go.Figure()
fig.add_trace(go.Scatter(y=network.history['loss'],mode='lines',name='Training Error'))
fig.add_trace(go.Scatter(y=network.history['val_loss'],mode='lines',name='Validation Error'))
fig.update_layout(yaxis_title='Mean Absolute Error',xaxis_title = 'epoch', title_text = 'Normalized MAE Validation = '
+ str(valMae))
fig.show()
# Get the predicted values
y_pred_scaled = model.predict(testNormX)
# Unscale the predicted values
y_pred = scaler_pred.inverse_transform(y_pred_scaled)
y_test_unscaled = scaler_pred.inverse_transform(testNormY.reshape(-1, 1))
# Mean Absolute Error (MAE)
MAE = np.mean(tf.keras.metrics.mean_absolute_error(y_test_unscaled, y_pred))
print(f'Mean Absolute Error (MAE): {np.round(MAE, 2)}')
# Mean Absolute Percentage Error (MAPE)
MAPE = np.mean((np.abs(np.subtract(y_test_unscaled, y_pred)/ y_test_unscaled))) * 100
print(f'Mean Absolute Percentage Error (MAPE): {np.round(MAPE, 2)} %')
# Median Absolute Percentage Error (MDAPE)
MDAPE = np.median((np.abs(np.subtract(y_test_unscaled, y_pred)/ y_test_unscaled)) ) * 100
print(f'Median Absolute Percentage Error (MDAPE): {np.round(MDAPE, 2)} %')
Mean Absolute Error (MAE): 13.29
Mean Absolute Percentage Error (MAPE): 43.67 %
Median Absolute Percentage Error (MDAPE): 19.94 %
fig = go.Figure()
fig.add_trace(go.Scatter(y=y_pred.reshape(-1,),mode='markers',name='Model Predictions on Test Set'))
fig.add_trace(go.Scatter(y=y_test_unscaled.reshape(-1,),mode='markers',name='Target Values for the Test Set'))
fig.update_layout(title_text = 'Unnormalized MAE Test = ' + str(MAE))
fig.show()
# Build the last two 7-step windows of both features, shaped (samples, timesteps, features).
pred_y_norm = np.vstack((testNorm[-8:-1,:], testNorm[-7:,:])).reshape(2,7,2)
pred_y_norm
pred_y = model.predict(pred_y_norm)
pred_y_unnorm = scaler_pred.inverse_transform(pred_y)
print('y = %d on March 1st, y = %d on March 2nd' % (float(pred_y_unnorm[0]), float(pred_y_unnorm[1])))
y = 32 on March 1st, y = 40 on March 2nd
y = data['y'].values
train_portion = round(y.shape[0]*0.7)
val_portion = round(y.shape[0]*0.15)
train_data = y[:train_portion]
val_data = y[train_portion:train_portion+val_portion]
test_data = y[train_portion+val_portion:]
#print('We have %d training, %d validation, and %d test data points.' % (len(train_data), len(val_data), len(test_data)))
#print(train_data.shape)
#print(val_data.shape)
#print(test_data.shape)
scaler_pred = MinMaxScaler(feature_range = (0,1))
Reshape each 1D array into a 2D column vector, as required by MinMaxScaler.
#print(train_data.shape)
train_data = train_data.reshape(-1,1)
val_data = val_data.reshape(-1,1)
test_data = test_data.reshape(-1,1)
#print(train_data.shape)
#print(val_data.shape)
#print(test_data.shape)
scaler_pred.fit(train_data)
trainNorm = scaler_pred.transform(train_data)
valNorm = scaler_pred.transform(val_data)
testNorm = scaler_pred.transform(test_data)
#print(trainNorm.shape)
#print(valNorm.shape)
#print(testNorm.shape)
def creatSeq(dataset, look_back, foresight):
    # Build supervised windows: each row of X holds look_back consecutive values,
    # and Y holds the value (look_back + foresight) steps after the window start.
    X, Y = [], []
    for i in range(len(dataset) - look_back - foresight):
        obs = dataset[i:(i + look_back), 0]  # window of look_back values
        X.append(obs)
        Y.append(dataset[i + (look_back + foresight), 0])  # future target
    return np.array(X), np.array(Y)
trainNormX, trainNormY = creatSeq(trainNorm, look_back = 7, foresight = 1)
valNormX, valNormY = creatSeq(valNorm, look_back = 7, foresight = 1)
testNormX, testNormY = creatSeq(testNorm, look_back = 7, foresight = 1)
#print(testNormX.shape)
#print(trainNormY.shape)
#print(valNormX.shape)
#print(valNormY.shape)
#print(testNormX.shape)
#print(testNormY.shape)
If the number of neurons or hidden layers is too large, the model will overfit; if the number of neurons is too small, it will underfit. We noticed that setting the number of filters to 64 in the first (Conv1D) layer and the number of neurons in the hidden dense layer to the look-back step reduced the test-set error.
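Conv1D expects 3D input of shape (samples, timesteps, features). The windows built by creatSeq above are 2D, so a trailing channel axis is added here (a step added for runnability; it was presumably implicit in the original run):
# Reshape each sample from (7,) to (timesteps, features) = (7, 1).
trainNormX = trainNormX.reshape(-1, 7, 1)
valNormX = valNormX.reshape(-1, 7, 1)
testNormX = testNormX.reshape(-1, 7, 1)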
model = Sequential()
model.add(Conv1D(64, kernel_size = 5, input_shape = (7,1), activation = 'linear'))
model.add(MaxPooling1D(pool_size = 2))
model.add(Flatten())  # collapse the pooled feature maps so the output is (batch, 1)
model.add(Dense(7, activation = 'linear'))
model.add(Dense(1, activation = 'linear'))
model.compile(loss = 'mae', optimizer = 'adam', metrics = ['mean_absolute_error'])
#model.summary()
early_stop = EarlyStopping(monitor='val_loss',patience=5,verbose=1,mode='auto',restore_best_weights=True)
callbacks_list = [early_stop]
network = model.fit(trainNormX,trainNormY, validation_data = (valNormX, valNormY),
epochs=100,batch_size=64,callbacks=callbacks_list)
Epoch 1/100 5/5 [==============================] - 0s 32ms/step - loss: 0.2998 - mean_absolute_error: 0.2998 - val_loss: 0.2054 - val_mean_absolute_error: 0.2054 Epoch 2/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1647 - mean_absolute_error: 0.1647 - val_loss: 0.1383 - val_mean_absolute_error: 0.1383 Epoch 3/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1500 - mean_absolute_error: 0.1500 - val_loss: 0.1632 - val_mean_absolute_error: 0.1632 Epoch 4/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1632 - mean_absolute_error: 0.1632 - val_loss: 0.1517 - val_mean_absolute_error: 0.1517 Epoch 5/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1461 - mean_absolute_error: 0.1461 - val_loss: 0.1376 - val_mean_absolute_error: 0.1376 Epoch 6/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1404 - mean_absolute_error: 0.1404 - val_loss: 0.1385 - val_mean_absolute_error: 0.1385 Epoch 7/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1437 - mean_absolute_error: 0.1437 - val_loss: 0.1391 - val_mean_absolute_error: 0.1391 Epoch 8/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1415 - mean_absolute_error: 0.1415 - val_loss: 0.1364 - val_mean_absolute_error: 0.1364 Epoch 9/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1386 - mean_absolute_error: 0.1386 - val_loss: 0.1372 - val_mean_absolute_error: 0.1372 Epoch 10/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1388 - mean_absolute_error: 0.1388 - val_loss: 0.1371 - val_mean_absolute_error: 0.1371 Epoch 11/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1383 - mean_absolute_error: 0.1383 - val_loss: 0.1362 - val_mean_absolute_error: 0.1362 Epoch 12/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1375 - mean_absolute_error: 0.1375 - val_loss: 0.1359 - val_mean_absolute_error: 0.1359 Epoch 13/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1367 - mean_absolute_error: 0.1367 - val_loss: 0.1358 - val_mean_absolute_error: 0.1358 Epoch 14/100 5/5 [==============================] - 0s 13ms/step - loss: 0.1360 - mean_absolute_error: 0.1360 - val_loss: 0.1358 - val_mean_absolute_error: 0.1358 Epoch 15/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1362 - mean_absolute_error: 0.1362 - val_loss: 0.1357 - val_mean_absolute_error: 0.1357 Epoch 16/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1355 - mean_absolute_error: 0.1355 - val_loss: 0.1357 - val_mean_absolute_error: 0.1357 Epoch 17/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1350 - mean_absolute_error: 0.1350 - val_loss: 0.1355 - val_mean_absolute_error: 0.1355 Epoch 18/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1347 - mean_absolute_error: 0.1347 - val_loss: 0.1357 - val_mean_absolute_error: 0.1357 Epoch 19/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1350 - mean_absolute_error: 0.1350 - val_loss: 0.1354 - val_mean_absolute_error: 0.1354 Epoch 20/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1343 - mean_absolute_error: 0.1343 - val_loss: 0.1352 - val_mean_absolute_error: 0.1352 Epoch 21/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1343 - mean_absolute_error: 0.1343 - val_loss: 0.1354 - val_mean_absolute_error: 0.1354 Epoch 22/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1339 - mean_absolute_error: 0.1339 - val_loss: 0.1351 - val_mean_absolute_error: 
0.1351 Epoch 23/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1336 - mean_absolute_error: 0.1336 - val_loss: 0.1350 - val_mean_absolute_error: 0.1350 Epoch 24/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1336 - mean_absolute_error: 0.1336 - val_loss: 0.1350 - val_mean_absolute_error: 0.1350 Epoch 25/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1331 - mean_absolute_error: 0.1331 - val_loss: 0.1349 - val_mean_absolute_error: 0.1349 Epoch 26/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1329 - mean_absolute_error: 0.1329 - val_loss: 0.1350 - val_mean_absolute_error: 0.1350 Epoch 27/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1329 - mean_absolute_error: 0.1329 - val_loss: 0.1349 - val_mean_absolute_error: 0.1349 Epoch 28/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1324 - mean_absolute_error: 0.1324 - val_loss: 0.1348 - val_mean_absolute_error: 0.1348 Epoch 29/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1327 - mean_absolute_error: 0.1327 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 30/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1326 - mean_absolute_error: 0.1326 - val_loss: 0.1348 - val_mean_absolute_error: 0.1348 Epoch 31/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1327 - mean_absolute_error: 0.1327 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 32/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1326 - mean_absolute_error: 0.1326 - val_loss: 0.1350 - val_mean_absolute_error: 0.1350 Epoch 33/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1323 - mean_absolute_error: 0.1323 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 34/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1327 - mean_absolute_error: 0.1327 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 35/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1323 - mean_absolute_error: 0.1323 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 36/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1324 - mean_absolute_error: 0.1324 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 37/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1325 - mean_absolute_error: 0.1325 - val_loss: 0.1350 - val_mean_absolute_error: 0.1350 Epoch 38/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1323 - mean_absolute_error: 0.1323 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 39/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1318 - mean_absolute_error: 0.1318 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 40/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1328 - mean_absolute_error: 0.1328 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 41/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1324 - mean_absolute_error: 0.1324 - val_loss: 0.1345 - val_mean_absolute_error: 0.1345 Epoch 42/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1325 - mean_absolute_error: 0.1325 - val_loss: 0.1347 - val_mean_absolute_error: 0.1347 Epoch 43/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1323 - mean_absolute_error: 0.1323 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 44/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1319 - mean_absolute_error: 0.1319 - val_loss: 0.1344 - 
val_mean_absolute_error: 0.1344 Epoch 45/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1329 - mean_absolute_error: 0.1329 - val_loss: 0.1344 - val_mean_absolute_error: 0.1344 Epoch 46/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1318 - mean_absolute_error: 0.1318 - val_loss: 0.1351 - val_mean_absolute_error: 0.1351 Epoch 47/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1328 - mean_absolute_error: 0.1328 - val_loss: 0.1352 - val_mean_absolute_error: 0.1352 Epoch 48/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1324 - mean_absolute_error: 0.1324 - val_loss: 0.1343 - val_mean_absolute_error: 0.1343 Epoch 49/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1323 - mean_absolute_error: 0.1323 - val_loss: 0.1344 - val_mean_absolute_error: 0.1344 Epoch 50/100 5/5 [==============================] - 0s 6ms/step - loss: 0.1324 - mean_absolute_error: 0.1324 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 51/100 5/5 [==============================] - 0s 6ms/step - loss: 0.1322 - mean_absolute_error: 0.1322 - val_loss: 0.1344 - val_mean_absolute_error: 0.1344 Epoch 52/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1319 - mean_absolute_error: 0.1319 - val_loss: 0.1343 - val_mean_absolute_error: 0.1343 Epoch 53/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1320 - mean_absolute_error: 0.1320 - val_loss: 0.1343 - val_mean_absolute_error: 0.1343 Epoch 54/100 5/5 [==============================] - 0s 6ms/step - loss: 0.1328 - mean_absolute_error: 0.1328 - val_loss: 0.1343 - val_mean_absolute_error: 0.1343 Epoch 55/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1318 - mean_absolute_error: 0.1318 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 56/100 5/5 [==============================] - 0s 8ms/step - loss: 0.1323 - mean_absolute_error: 0.1323 - val_loss: 0.1350 - val_mean_absolute_error: 0.1350 Epoch 57/100 5/5 [==============================] - 0s 7ms/step - loss: 0.1318 - mean_absolute_error: 0.1318 - val_loss: 0.1343 - val_mean_absolute_error: 0.1343 Epoch 58/100 1/5 [=====>........................] - ETA: 0s - loss: 0.1356 - mean_absolute_error: 0.1356Restoring model weights from the end of the best epoch: 53. 5/5 [==============================] - 0s 8ms/step - loss: 0.1327 - mean_absolute_error: 0.1327 - val_loss: 0.1346 - val_mean_absolute_error: 0.1346 Epoch 58: early stopping
Training stopped early at epoch 58; the weights from epoch 53, which had the best validation error, were restored to prevent overfitting.
valMae = round(network.history['val_loss'][-1],2)
fig = go.Figure()
fig.add_trace(go.Scatter(y=network.history['loss'],mode='lines',name='Training Error'))
fig.add_trace(go.Scatter(y=network.history['val_loss'],mode='lines',name='Validation Error'))
fig.update_layout(yaxis_title='Mean Absolute Error',xaxis_title = 'epoch', title_text = 'Normalized MAE Validation = '
+ str(valMae))
fig.show()
testNormPred = model.predict(testNormX)
testPred = scaler_pred.inverse_transform(testNormPred.reshape(-1,1))
testY = scaler_pred.inverse_transform(testNormY.reshape(-1,1))
testMae = tf.keras.metrics.mean_absolute_error(testY, testPred)
fig = go.Figure()
fig.add_trace(go.Scatter(y=testPred.reshape(-1,),mode='markers',name='Model Predictions on Test Set'))
fig.add_trace(go.Scatter(y=testY.reshape(-1,),mode='markers',name='Target Values for the Test Set'))
fig.update_layout(title_text = 'Unnormalized MAE Test = ' + str(np.mean(testMae)))
fig.show()
# Build the last two 7-day windows and add the channel axis Conv1D expects.
pred_y_norm = np.vstack((testNorm[-8:-1,0], testNorm[-7:,0])).reshape(2,7,1)
pred_y_norm
pred_y = model.predict(pred_y_norm)
pred_y_unnorm = scaler_pred.inverse_transform(pred_y.reshape(-1,1))
print('y = %d on March 1st, y = %d on March 2nd' % (float(pred_y_unnorm[0]), float(pred_y_unnorm[1])))
y = 74 on March 1st, y = 73 on March 2nd
X = data[['x1','x2','y']]
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import grangercausalitytests
model = VAR(X)
The optimal lag order is p = 1; all four information criteria below select lag 1.
lag_order = model.select_order(15)
print(lag_order.summary())
VAR Order Selection (* highlights the minimums)
==================================================
AIC BIC FPE HQIC
--------------------------------------------------
0 8.806 8.836 6677. 8.818
1 8.681* 8.799* 5893.* 8.728*
2 8.689 8.895 5938. 8.771
3 8.714 9.008 6086. 8.830
4 8.738 9.120 6239. 8.890
5 8.778 9.248 6489. 8.964
6 8.774 9.332 6463. 8.995
7 8.737 9.383 6230. 8.993
8 8.754 9.488 6337. 9.044
9 8.762 9.585 6389. 9.087
10 8.785 9.696 6541. 9.145
11 8.793 9.792 6596. 9.188
12 8.810 9.897 6709. 9.240
13 8.825 10.00 6816. 9.290
14 8.789 10.05 6574. 9.289
15 8.815 10.17 6756. 9.350
--------------------------------------------------
results = model.fit(1)
#print(results.summary())
The model is stable (stationary), since the moduli of all roots are greater than 1, i.e., all roots lie outside the unit circle.
nroots = 3*1 # k*p
print('Roots =')
for i in range(nroots):
    print(results.roots[i])
print()
print('Moduli =')
for i in range(nroots):
    print(np.absolute(results.roots[i]))
Roots =
12.099710553456287
5.516727220519573
2.9802842676386483

Moduli =
12.099710553456287
5.516727220519573
2.9802842676386483
# grangercausalitytests(X[['x1','x2']],3) # x2 does not Granger-cause x1
# grangercausalitytests(X[['x1','y']],3)  # y does not Granger-cause x1
# grangercausalitytests(X[['y','x1']],3)  # x1 does not Granger-cause y
# grangercausalitytests(X[['x2','x1']],3) # x1 Granger-causes x2
# grangercausalitytests(X[['y','x2']],3)  # x2 Granger-causes y
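The conclusions in the comments above come from the F-test p-values that grangercausalitytests prints for each lag. A small sketch (assuming the usual 0.05 significance level) extracts them programmatically:
# p-values of the SSR F-test at lags 1..3 for 'does x2 Granger-cause y?'
res = grangercausalitytests(X[['y','x2']], maxlag=3, verbose=False)
pvals = {lag: round(res[lag][0]['ssr_ftest'][1], 4) for lag in res}
#print(pvals)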
lag_order = results.k_ar
print('Lag order =', lag_order)
forecast_values = results.forecast(X.values[-lag_order:],2)
print()
print('Forecast values for y:')
print(forecast_values[:,2])  # column 2 of the forecast is y
Lag order = 1

Forecast values for y:
[70.67772124 72.29876426]
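statsmodels can also attach uncertainty to these point forecasts (a sketch using the same fitted model; the 95% level is an assumed choice, not part of the original analysis):
point, lower, upper = results.forecast_interval(X.values[-lag_order:], 2, alpha=0.05)
#print('95% intervals for y:', list(zip(lower[:,2], upper[:,2])))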
Having assessed the different models for predicting y from the 'Data.csv' file, we decided on the model using the 1D convolutional neural network. From the error plots, it has the lowest test error, around 5.7. Using this model, we predict y = 74 on March 1st and y = 73 on March 2nd.